library(knitr)
## Warning: package 'knitr' was built under R version 4.0.5

PROBLEM DEFINITION

a) Specifying the Question

Identify anomalies in the given sales dataset; in this context, anomaly detection serves as fraud detection.

b) Defining the metrics for success

The project will be a success if we can determine whether there are any anomalies in the given sales dataset. The objective of this task is fraud detection.

c) Understanding the context

You are a data analyst at Carrefour Kenya and are currently undertaking a project that will inform the marketing department on the most relevant marketing strategies that will result in the highest number of sales (total price including tax). Your project has been divided into four parts where you’ll explore a recent marketing dataset by performing various unsupervised learning techniques and later providing recommendations based on your insights.

d) Recording the Experimental Design

Define the question, the metric for success, the context, and the experimental design taken. Read and explore the given dataset. Identify anomalies in the dataset (fraud detection).

e) Relevance of the data

The data used for this project will inform the marketing department on the most relevant marketing strategies that will result in the highest number of sales (total price including tax).

Dataset: http://bit.ly/CarreFourSalesDataset

Data Analysis

Loading the required packages

#install.packages("anomalize") # Anomaly detection
library(anomalize)
## Warning: package 'anomalize' was built under R version 4.0.5
## == Use anomalize to improve your Forecasts by 50%! =============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tibbletime)
## Warning: package 'tibbletime' was built under R version 4.0.5
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter

Loading the data

anom<-read.csv("C:/Users/Silvia/Downloads/Supermarket_Sales_Forecasting - Sales.csv")

Data Processing

# Previewing the first 6 rows
head(anom)
##        Date    Sales
## 1  1/5/2019 548.9715
## 2  3/8/2019  80.2200
## 3  3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5  2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing the datatypes of our data
str(anom)
## 'data.frame':    1000 obs. of  2 variables:
##  $ Date : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num  549 80.2 340.5 489 634.4 ...
# Totalling sales that fall on the same date
anom_aggregate<-aggregate(anom$Sales,by=list(Date=anom$Date),FUN=sum)
head(anom_aggregate)
##        Date        x
## 1  1/1/2019 4745.181
## 2 1/10/2019 3560.949
## 3 1/11/2019 2114.963
## 4 1/12/2019 5184.764
## 5 1/13/2019 2451.204
## 6 1/14/2019 3966.617
# Getting a dataframe of the frequency table of Date
date_table<-data.frame(table(anom$Date))
head(date_table)
##        Var1 Freq
## 1  1/1/2019   12
## 2 1/10/2019    9
## 3 1/11/2019    8
## 4 1/12/2019   11
## 5 1/13/2019   10
## 6 1/14/2019   13
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks tibbletime::filter(), stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
# combining both dataframes
final_df<-merge(anom_aggregate,date_table,by.x= "Date", by.y="Var1")
final_df
##         Date         x Freq
## 1   1/1/2019 4745.1810   12
## 2  1/10/2019 3560.9490    9
## 3  1/11/2019 2114.9625    8
## 4  1/12/2019 5184.7635   11
## 5  1/13/2019 2451.2040   10
## 6  1/14/2019 3966.6165   13
## 7  1/15/2019 5944.2600   13
## 8  1/16/2019 4289.0820   10
## 9  1/17/2019 3142.7550   11
## 10 1/18/2019 2780.4735    9
## 11 1/19/2019 4914.7245   16
## 12  1/2/2019 1945.5030    8
## 13 1/20/2019 3655.4490   10
## 14 1/21/2019 2392.0995    8
## 15 1/22/2019 1704.7695    7
## 16 1/23/2019 5994.1875   17
## 17 1/24/2019 5402.0505   13
## 18 1/25/2019 4700.3670   17
## 19 1/26/2019 4457.5125   17
## 20 1/27/2019 4635.8970   14
## 21 1/28/2019 4999.7115   14
## 22 1/29/2019 3516.5655   12
## 23  1/3/2019 2078.1285    8
## 24 1/30/2019 2558.2620    9
## 25 1/31/2019 5232.4965   14
## 26  1/4/2019 1623.6885    6
## 27  1/5/2019 3536.6835   12
## 28  1/6/2019 3614.2050    9
## 29  1/7/2019 2834.2440    9
## 30  1/8/2019 5293.7325   18
## 31  1/9/2019 3021.3435    8
## 32  2/1/2019 2444.5365    6
## 33 2/10/2019 3141.0225   11
## 34 2/11/2019 4542.1530    8
## 35 2/12/2019 2998.9890    8
## 36 2/13/2019  934.2375    8
## 37 2/14/2019 2454.0915    8
## 38 2/15/2019 6830.7855   19
## 39 2/16/2019 2503.7670    8
## 40 2/17/2019 5299.5705   13
## 41 2/18/2019 1496.0295    7
## 42 2/19/2019 4228.1190    9
## 43  2/2/2019 4140.9480   14
## 44 2/20/2019 2706.4170   10
## 45 2/21/2019 1393.7385    6
## 46 2/22/2019 2442.3105   11
## 47 2/23/2019 2339.5890    8
## 48 2/24/2019 2722.4610    9
## 49 2/25/2019 4807.2360   16
## 50 2/26/2019 2408.1645    9
## 51 2/27/2019 5859.4515   14
## 52 2/28/2019 2097.0180    6
## 53  2/3/2019 5467.9275   14
## 54  2/4/2019 2439.4965   11
## 55  2/5/2019 3031.1295   12
## 56  2/6/2019 2905.4235   13
## 57  2/7/2019 7228.2105   20
## 58  2/8/2019 5084.6565   12
## 59  2/9/2019 3271.8945   13
## 60  3/1/2019 2634.3660   10
## 61 3/10/2019 3163.2300   12
## 62 3/11/2019 2961.2520   11
## 63 3/12/2019 3677.5515   12
## 64 3/13/2019 2063.6070   10
## 65 3/14/2019 7214.6340   18
## 66 3/15/2019 2942.4150   12
## 67 3/16/2019 3154.4730    9
## 68 3/17/2019 1976.2890    6
## 69 3/18/2019 1292.8335    7
## 70 3/19/2019 5740.3920   16
## 71  3/2/2019 6560.3055   18
## 72 3/20/2019 5458.2045   15
## 73 3/21/2019 1877.5155    6
## 74 3/22/2019 3179.1480   10
## 75 3/23/2019 4095.0420   11
## 76 3/24/2019 3477.4635   11
## 77 3/25/2019 2272.9665    9
## 78 3/26/2019 1962.5130   13
## 79 3/27/2019 2902.8195   10
## 80 3/28/2019 2229.4020   10
## 81 3/29/2019 4023.2430    8
## 82  3/3/2019 4853.1735   14
## 83 3/30/2019 4487.0595   11
## 84  3/4/2019 3894.4395   12
## 85  3/5/2019 6230.8785   17
## 86  3/6/2019 3092.5965   11
## 87  3/7/2019 1438.2585    9
## 88  3/8/2019 3125.3880   11
## 89  3/9/2019 7474.0470   16
# Renaming columns
names(final_df)<-c("Date","Total.Sales","count")
head(final_df)
##        Date Total.Sales count
## 1  1/1/2019    4745.181    12
## 2 1/10/2019    3560.949     9
## 3 1/11/2019    2114.963     8
## 4 1/12/2019    5184.764    11
## 5 1/13/2019    2451.204    10
## 6 1/14/2019    3966.617    13
# Changing the date column to Date format
final_df$Date<-mdy(final_df$Date)
str(final_df)
## 'data.frame':    89 obs. of  3 variables:
##  $ Date       : Date, format: "2019-01-01" "2019-01-10" ...
##  $ Total.Sales: num  4745 3561 2115 5185 2451 ...
##  $ count      : int  12 9 8 11 10 13 13 10 11 9 ...
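As a cross-check, the same daily totals and transaction counts can be produced in a single tidyverse pipeline. This is a minimal sketch assuming the tidyverse loaded above and the original anom dataframe; final_df_check is a hypothetical name.

# Hypothetical dplyr equivalent of the aggregate()/table()/merge() steps above
final_df_check <- anom %>%
    group_by(Date) %>%
    summarise(Total.Sales = sum(Sales),  # total sales per date
              count = n()) %>%           # number of transactions per date
    mutate(Date = mdy(Date)) %>%         # parse m/d/Y strings into Dates
    arrange(Date)                        # order chronologically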
final_df1 <- final_df %>% select(Date,count)
final_df1
##          Date count
## 1  2019-01-01    12
## 2  2019-01-10     9
## 3  2019-01-11     8
## 4  2019-01-12    11
## 5  2019-01-13    10
## 6  2019-01-14    13
## 7  2019-01-15    13
## 8  2019-01-16    10
## 9  2019-01-17    11
## 10 2019-01-18     9
## 11 2019-01-19    16
## 12 2019-01-02     8
## 13 2019-01-20    10
## 14 2019-01-21     8
## 15 2019-01-22     7
## 16 2019-01-23    17
## 17 2019-01-24    13
## 18 2019-01-25    17
## 19 2019-01-26    17
## 20 2019-01-27    14
## 21 2019-01-28    14
## 22 2019-01-29    12
## 23 2019-01-03     8
## 24 2019-01-30     9
## 25 2019-01-31    14
## 26 2019-01-04     6
## 27 2019-01-05    12
## 28 2019-01-06     9
## 29 2019-01-07     9
## 30 2019-01-08    18
## 31 2019-01-09     8
## 32 2019-02-01     6
## 33 2019-02-10    11
## 34 2019-02-11     8
## 35 2019-02-12     8
## 36 2019-02-13     8
## 37 2019-02-14     8
## 38 2019-02-15    19
## 39 2019-02-16     8
## 40 2019-02-17    13
## 41 2019-02-18     7
## 42 2019-02-19     9
## 43 2019-02-02    14
## 44 2019-02-20    10
## 45 2019-02-21     6
## 46 2019-02-22    11
## 47 2019-02-23     8
## 48 2019-02-24     9
## 49 2019-02-25    16
## 50 2019-02-26     9
## 51 2019-02-27    14
## 52 2019-02-28     6
## 53 2019-02-03    14
## 54 2019-02-04    11
## 55 2019-02-05    12
## 56 2019-02-06    13
## 57 2019-02-07    20
## 58 2019-02-08    12
## 59 2019-02-09    13
## 60 2019-03-01    10
## 61 2019-03-10    12
## 62 2019-03-11    11
## 63 2019-03-12    12
## 64 2019-03-13    10
## 65 2019-03-14    18
## 66 2019-03-15    12
## 67 2019-03-16     9
## 68 2019-03-17     6
## 69 2019-03-18     7
## 70 2019-03-19    16
## 71 2019-03-02    18
## 72 2019-03-20    15
## 73 2019-03-21     6
## 74 2019-03-22    10
## 75 2019-03-23    11
## 76 2019-03-24    11
## 77 2019-03-25     9
## 78 2019-03-26    13
## 79 2019-03-27    10
## 80 2019-03-28    10
## 81 2019-03-29     8
## 82 2019-03-03    14
## 83 2019-03-30    11
## 84 2019-03-04    12
## 85 2019-03-05    17
## 86 2019-03-06    11
## 87 2019-03-07     9
## 88 2019-03-08    11
## 89 2019-03-09    16
# Convert df to a tibble
final_df1 <- as_tibble(final_df1)
class(final_df1)
## [1] "tbl_df"     "tbl"        "data.frame"
df_anomalized <- final_df1 %>%
    time_decompose(count, merge = TRUE) %>%
    anomalize(remainder) %>%
    time_recompose()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
df_anomalized %>% glimpse()
## Rows: 89
## Columns: 11
## $ Date          <date> 2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01~
## $ count         <int> 12, 8, 8, 6, 12, 9, 9, 18, 8, 9, 8, 11, 10, 13, 13, 10, ~
## $ observed      <dbl> 12, 9, 8, 11, 10, 13, 13, 10, 11, 9, 16, 8, 10, 8, 7, 17~
## $ season        <dbl> 0.883193879, 1.000751930, -2.026508631, 0.003578774, 0.0~
## $ trend         <dbl> 10.03092, 10.16344, 10.29595, 10.42847, 10.56007, 10.691~
## $ remainder     <dbl> 1.0858871, -2.1641886, -0.2694456, 0.5679495, -0.6329071~
## $ remainder_l1  <dbl> -14.79896, -14.79896, -14.79896, -14.79896, -14.79896, -~
## $ remainder_l2  <dbl> 15.41235, 15.41235, 15.41235, 15.41235, 15.41235, 15.412~
## $ anomaly       <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "N~
## $ recomposed_l1 <dbl> -3.884847, -3.634772, -6.529515, -4.366910, -4.166053, -~
## $ recomposed_l2 <dbl> 26.32647, 26.57654, 23.68180, 25.84440, 26.04526, 25.615~
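The repeated “Index not ordered” notes above arise because final_df1 is sorted by the original character dates rather than chronologically, and tibbletime assumes an ascending index. A minimal sketch of the fix, using hypothetical names, is to arrange by Date before decomposing:

# Sort chronologically so the decomposition sees an ordered index
final_df1_sorted <- final_df1 %>% arrange(Date)

df_anomalized_sorted <- final_df1_sorted %>%
    time_decompose(count, merge = TRUE) %>%
    anomalize(remainder) %>%
    time_recompose()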

Visualizing the anomalies

df_anomalized %>% plot_anomalies(ncol = 3, alpha_dots = 0.75)

Adjusting Trend and Seasonality

p1 <- df_anomalized %>%
    plot_anomaly_decomposition() +
    ggtitle("Freq/Trend = 'auto'")
p1

When “auto” is used, get_time_scale_template() is used to determine the logical frequency and trend spans based on the scale of the data. You can uncover the logic:

get_time_scale_template()
## # A tibble: 8 x 3
##   time_scale frequency trend   
##   <chr>      <chr>     <chr>   
## 1 second     1 hour    12 hours
## 2 minute     1 day     14 days 
## 3 hour       1 day     1 month 
## 4 day        1 week    3 months
## 5 week       1 quarter 1 year  
## 6 month      1 year    5 years 
## 7 quarter    1 year    10 years
## 8 year       5 years   30 years
  • This implies that if the scale is 1 day (meaning the difference between each data point is 1 day), then the frequency will be 7 days (or 1 week) and the trend will be around 90 days (or 3 months). This logic can easily be adjusted in two ways: local parameter adjustment and global parameter adjustment, demonstrated in the next two sections.

Adjusting Local Parameters

p2 <- final_df1 %>%
    time_decompose(count,
                   frequency = "auto",
                   trend     = "2 weeks") %>%
    anomalize(remainder) %>%
    plot_anomaly_decomposition() +
    ggtitle("Trend = 2 Weeks (Local)")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Warning in lubridate::floor_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## Warning in lubridate::ceiling_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## trend = 14 days
# Show plots
p1

p2

  • After adjusting the trend with a local parameter, we can see some anomalies being detected.

Adjusting the Global Parameter

We adjust globally by using set_time_scale_template() to update the default template to one that we prefer. We’ll change the “3 months” trend to “2 weeks” for time_scale = “day”. Use time_scale_template() to retrieve the time scale template that anomalize begins with, mutate() the trend field in the desired location, and use set_time_scale_template() to update the template in the global options. We can retrieve the updated template using get_time_scale_template() to verify the change has been executed properly.
time_scale_template() %>%
    mutate(trend = ifelse(time_scale == "day", "2 weeks", trend)) %>%
    set_time_scale_template()
get_time_scale_template()
## # A tibble: 8 x 3
##   time_scale frequency trend   
##   <chr>      <chr>     <chr>   
## 1 second     1 hour    12 hours
## 2 minute     1 day     14 days 
## 3 hour       1 day     1 month 
## 4 day        1 week    2 weeks 
## 5 week       1 quarter 1 year  
## 6 month      1 year    5 years 
## 7 quarter    1 year    10 years
## 8 year       5 years   30 years
# Plotting to see the changes
p3 <- final_df1 %>%
    time_decompose(count) %>%
    anomalize(remainder) %>%
    plot_anomaly_decomposition() +
    ggtitle("Trend = 2 Weeks (Global)")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Warning in lubridate::floor_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## Warning in lubridate::ceiling_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## trend = 14 days
p3

Let’s reset the time scale template back to the original defaults.

time_scale_template() %>%
    set_time_scale_template()
# Verify the change
get_time_scale_template()
## # A tibble: 8 x 3
##   time_scale frequency trend   
##   <chr>      <chr>     <chr>   
## 1 second     1 hour    12 hours
## 2 minute     1 day     14 days 
## 3 hour       1 day     1 month 
## 4 day        1 week    3 months
## 5 week       1 quarter 1 year  
## 6 month      1 year    5 years 
## 7 quarter    1 year    10 years
## 8 year       5 years   30 years

Extracting the Anomalous Data Points

Now, we can extract the actual data points which are anomalies. For that, the following code can be run.

final_df1 %>% 
  time_decompose(count) %>%
  anomalize(remainder) %>%
  time_recompose() %>%
  filter(anomaly == 'Yes')
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
## # A time tibble: 0 x 10
## # Index: Date
## # ... with 10 variables: Date <date>, observed <dbl>, season <dbl>,
## #   trend <dbl>, remainder <dbl>, remainder_l1 <dbl>, remainder_l2 <dbl>,
## #   anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>
  • As we can see from the table, no anomalies were detected in the data with the default parameters.

Adjusting Alpha and Max Anoms

  • Alpha
We can adjust alpha, which is set to 0.05 by default. By default, the bands just cover the outside of the range.

p4 <- final_df1 %>%
    time_decompose(count) %>%
    anomalize(remainder, alpha = 0.05, max_anoms = 0.2) %>%
    time_recompose() %>%
    plot_anomalies(time_recomposed = TRUE) +
    ggtitle("alpha = 0.05")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
p4

If we decrease alpha, it widens the bands, making it more difficult for a point to be an outlier. Here, you can see that the bands have become twice as big.

p5 <- final_df1 %>%
    time_decompose(count) %>%
    anomalize(remainder, alpha = 0.025, max_anoms = 0.2) %>%
    time_recompose() %>%
    plot_anomalies(time_recomposed = TRUE) +
    ggtitle("alpha = 0.05")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
p5

  • Max Anoms

The max_anoms parameter is used to control the maximum percentage of data that can be an anomaly. Let’s set alpha = 0.3 so that pretty much anything is an outlier, and then compare max_anoms = 0.2 (20% anomalies allowed) with max_anoms = 0.05 (5% anomalies allowed).
p6 <- final_df1 %>%
    time_decompose(count) %>%
    anomalize(remainder, alpha = 0.3, max_anoms = 0.2) %>%
    time_recompose() %>%
    plot_anomalies(time_recomposed = TRUE) +
    ggtitle("20% Anomalies")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
p7 <- final_df1 %>%
    time_decompose(count) %>%
    anomalize(remainder, alpha = 0.3, max_anoms = 0.05) %>%
    time_recompose() %>%
    plot_anomalies(time_recomposed = TRUE) +
    ggtitle("5% Anomalies")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
p6

p7

  • Adjusting alpha and max_anoms results in anomalies being flagged in our data.
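To see exactly which dates are flagged under these relaxed settings, the earlier extraction pipeline can be re-run with the same parameters. This is a sketch; the dates returned depend on the chosen alpha and max_anoms values.

final_df1 %>%
    time_decompose(count) %>%
    anomalize(remainder, alpha = 0.3, max_anoms = 0.05) %>%
    time_recompose() %>%
    filter(anomaly == 'Yes')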

Using the ‘timetk’ package

Interactive Anomaly Visualization

Here, timetk’s plot_anomaly_diagnostics() function makes it possible to tweak some of the parameters on the fly.
final_df1 %>% timetk::plot_anomaly_diagnostics(Date,count, .facet_ncol = 2)
## frequency = 7 observations per 1 week
## trend = 31 observations per 1 month

Interactive Anomaly Detection

To find the exact data points that are anomalies, we use the tk_anomaly_diagnostics() function.

final_df1 %>% timetk::tk_anomaly_diagnostics(Date, count) %>% filter(anomaly=='Yes')
## frequency = 7 observations per 1 week
## trend = 31 observations per 1 month
## # A tibble: 0 x 11
## # ... with 11 variables: Date <date>, observed <dbl>, season <dbl>,
## #   trend <dbl>, remainder <dbl>, seasadj <dbl>, remainder_l1 <dbl>,
## #   remainder_l2 <dbl>, anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>
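As a cross-check, the relaxed thresholds used with anomalize above can also be passed to timetk. The sketch below assumes tk_anomaly_diagnostics() exposes .alpha and .max_anomalies arguments that mirror anomalize’s alpha and max_anoms (defaults 0.05 and 0.2).

# Relaxed thresholds; .alpha and .max_anomalies are assumed argument names
final_df1 %>%
    timetk::tk_anomaly_diagnostics(Date, count, .alpha = 0.3, .max_anomalies = 0.05) %>%
    filter(anomaly == 'Yes')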

Conclusion

  • With the default decomposition settings, neither anomalize nor timetk flags any anomalous daily sales counts, so the dataset shows no evidence of fraud.
  • Anomalies only appear once the detection thresholds are deliberately relaxed (e.g. alpha = 0.3), which flags the most extreme days rather than clear-cut outliers.
  • The marketing department can therefore treat the recorded sales totals as a reliable basis for the strategies explored in the other parts of this project.